#################################################################################
# Filename: 1_round1_loaddata.R
# Description: Loads raw survey data from the Round 1 study and performs cleaning
#################################################################################

### Load library dependencies 
library(tidyverse)
library(broom)
library(DeclareDesign)
library(estimatr)
library(rio)
library(ggplot2)
library(gridExtra)

### Set your directory
# Replace the placeholder below with the folder that contains "Round_1_raw.Rdata".
# NOTE(review): setwd() and rm(list = ls()) both alter the caller's session;
# a project-relative path would make the script easier to run from elsewhere.
setwd(" < YOUR DIRECTORY HERE > ")

### Import raw data
# Clear the objects listed by ls(), then read the raw Round 1 file.
rm(list = ls())
dat <- import("Round_1_raw.Rdata")

# Keep only complete cases: rows are retained when `incomplete` is not NA.
dat <- dat[!is.na(dat$incomplete), ]

# Note: We have already removed duplicate entries based on IP Address and survey ID; 
# since this pre-processing was based on data that would compromise anonymity, 
# we have performed this in advance of other cleaning 

### Convert survey responses to numeric outcomes (5 is strongest, 1 is weakest)
# Every cell of `dat` that exactly equals one of the labels below is replaced by
# its code, whichever column it sits in. Character columns end up holding the
# codes as text; the selected items are converted to numeric further down.
# The three-point "direction of change" scale runs from 3 (increase) to 1 (decrease).
response_scales <- list(
  # Agreement, "赞同" wording
  list(labels = c("强烈赞同", "赞同", "既不赞同也不反对", "反对", "强烈反对"),
       codes  = c(5, 4, 3, 2, 1)),
  # Intensity
  list(labels = c("极其强烈", "较为强烈", "有一些", "仅有一点点", "完全没有或几乎没有"),
       codes  = c(5, 4, 3, 2, 1)),
  # Magnitude
  list(labels = c("非常大", "比较大", "适中", "比较小", "非常小"),
       codes  = c(5, 4, 3, 2, 1)),
  # Agreement, "同意" wording. Its two "disagree" labels are the same strings as
  # in the first scale and are already handled there, so they are not repeated.
  # The "neutral" label is matched in two spellings: the first is the one this
  # script has always used (second character U+2F74, a Kangxi radical form), the
  # second is the ordinary character U+7ACB.
  # NOTE(review): confirm which spelling the raw export actually contains.
  list(labels = c("强烈同意", "同意", "中⽴", "中立"),
       codes  = c(5, 4, 3, 3)),
  # Direction of change
  list(labels = c("增加", "保持不变", "减少"),
       codes  = c(3, 2, 1))
)

for (scale_def in response_scales) {
  for (k in seq_along(scale_def$labels)) {
    dat[dat == scale_def$labels[k]] <- scale_def$codes[k]
  }
}

# "Don't know / refuse to answer" is treated as missing in every column
dat[dat == "不知道/拒绝回答"] <- NA

rm(response_scales, scale_def, k)

# Age bracket (Q3) -> ordinal code 1-10; the position in this vector is the code
age_brackets <- c(
  "18-24", "25-29", "30-34", "35-39", "40-44",
  "45-49", "50-54", "55-59", "60-64", "65岁以上"
)
for (code in seq_along(age_brackets)) {
  dat$Q3[dat$Q3 == age_brackets[code]] <- code
}
rm(age_brackets, code)

# Q3 itself is overwritten with the codes; `age` is its numeric version
dat$age <- as.numeric(dat$Q3)


### Make variables numeric
# Each listed survey item is passed through as.numeric() in place. For character
# columns, any value that is not a number string becomes NA (with a warning).
numeric_items <- c(
  "Q28", "Q25", "Q53", "Q26", "Q52",
  paste0("Q19_", 1:4),
  paste0("Q21_", 1:4),
  paste0("Q54_", 1:4),
  paste0("Q55_", 1:4),
  paste0("Q56_", 1:4),
  "Q14"
)
for (item in numeric_items) {
  dat[[item]] <- as.numeric(dat[[item]])
}
rm(numeric_items, item)


### Create protest variables:
#     "protest.gov" is 1 when the Q32 answer contains "参加游行活动" (taking part in
#        a march / demonstration), NA when Q32 is an empty string, and 0 otherwise
#     "protest.japan.all" is 1 for people who are willing to sign the petition
#        (regardless of whether they actually signed), 0 for people who are not
#     "protest.japan.signed" additionally requires a non-empty surname entry

# NOTE(review): a Q32 value of NA (as opposed to "") is coded 0, not NA.
# Confirm that this is intended.
took_part <- grepl("参加游行活动", dat$Q32)
left_blank <- dat$Q32 %in% ""
dat$protest.gov <- as.numeric(ifelse(took_part, 1, ifelse(left_blank, NA, 0)))

# Any Q33 answer other than the two options below stays NA
willingness <- c("愿意（请仅填写姓氏）", "不愿意")
dat$protest.japan.all <- c(1, 0)[match(dat$Q33, willingness)]

# Willing respondents who left the surname field empty count as not having signed.
# NOTE(review): only "" is treated as an empty signature; an NA surname keeps the
# value of protest.japan.all.
blank_signature <- (dat$Q33_1_TEXT == "" & dat$protest.japan.all == 1) %in% TRUE
dat$protest.japan.signed <- dat$protest.japan.all
dat$protest.japan.signed[blank_signature] <- 0

rm(took_part, left_blank, willingness, blank_signature)


# Indicators for which type of propaganda respondents were exposed to.
# Each indicator is 1 for its own clip, 0 for the "none" condition and NA for
# every other value of `clip` (including the other treatment clip).
#   (Note that the raw variable naming is poor: "yongbu" marks the hard
#   propaganda clip but is not actually related to the name of that clip.)
soft_codes <- c(leopard = 1, none = 0)
hard_codes <- c(yongbu = 1, none = 0)

dat$soft <- unname(soft_codes[as.character(dat$clip)])
dat$hard <- unname(hard_codes[as.character(dat$clip)])

rm(soft_codes, hard_codes)

### Create PCA indices for different types of nationalism
# Helper: score every row of `data` on the first principal component of `items`
# (PCA on the correlation matrix). Rows with a missing value on any item are
# left out of the PCA and receive NA. The scores are then standardized with
# scale(), so the result is a one-column matrix, not a plain vector.
# NOTE(review): the orientation (sign) of a principal component is a software
# convention; check that higher values mean "more" of each construct.
first_component_index <- function(data, items) {
  fit <- princomp(reformulate(items), data = data, cor = TRUE, na.action = na.omit)
  index <- rep(NA, nrow(data))
  index[complete.cases(data[, items])] <- as.vector(fit$scores[, 1])
  scale(index)
}

# Patriotism
dat$patriot_pca <- first_component_index(dat, c("Q53", "Q56_1", "Q56_2", "Q56_3"))

# Antiforeign nationalism
dat$antiforeign_pca <- first_component_index(
  dat, c("Q25", "Q26", "Q54_1", "Q54_2", "Q54_3")
)

# Militant nationalism
dat$militant_pca <- first_component_index(dat, c("Q28", "Q52", "Q26"))

# Reorder to factors: fix the level order of `clip`, with "none" first
dat$clip <- factor(dat$clip, levels = c("none", "leopard", "yongbu"))

### Create other variables:
# Gender (1 if female, 0 if male, NA for any other Q4 value)
dat$gender <- as.numeric(ifelse(dat$Q4 == "女", 1, ifelse(dat$Q4 == "男", 0, NA)))

# Attention check for clips
# `soft_attn` is read by position (the 55th column of `dat`); judging by its
# name it is presumably the attention item for the soft clip.
# NOTE(review): a positional lookup breaks silently if the column layout of the
# raw file changes; referring to the column by name would be safer.
dat$soft_attn <- dat[, 55]

# A respondent passes (attent = 1) when soft_attn or Q74 holds the answer listed
# below, or when they are in the "none" condition; everyone else gets 0.
passed_check <- dat$soft_attn == "他被杀害了" |
  dat$clip == "none" |
  dat$Q74 == "国旗降半旗"
dat$attent <- as.numeric(passed_check %in% TRUE)
rm(passed_check)

# Urban hukou: 1 if Q8 is "城镇", 0 for any other answer, NA when Q8 is NA.
# NOTE(review): an empty-string answer is coded 0 here, not NA.
dat$urban.hukou <- as.numeric(dat$Q8 == "城镇")

# Urban residence: 1 if Q7 is "大城市", 0 for any other answer, NA when Q7 is NA
dat$urban.residence <- as.numeric(dat$Q7 == "大城市")


# Education: ordinal code 1-6, given by the position of the Q9 answer in this
# vector (primary school or below ... graduate degree or above). Other answers -> NA.
education_levels <- c(
  "小学或以下", "初中", "高中", "大专", "大学本科", "研究生或以上学历"
)
dat$education <- as.numeric(match(dat$Q9, education_levels))

# Party membership: 1 for Communist Party members, 0 for the three other listed
# affiliations, NA for any other Q10 value
party_labels <- c("共产党员", "群众", "共青团员", "民主党派人士")
dat$party <- c(1, 0, 0, 0)[match(dat$Q10, party_labels)]

# Income: ordinal code 1-7, given by the position of the Q11 answer in this vector.
# Refusals ("不清楚/拒绝回答") and blank answers ("") match no bracket and stay NA.
income_brackets <- c(
  "少于10,000",
  "10,001~30,000元",
  "30,001-60,000 元",
  "60,001-90,000 元",
  "90,001-120,000 元",
  "120,001-200,000元",
  "高于200,000元"
)
dat$income <- as.numeric(match(dat$Q11, income_brackets))

rm(education_levels, party_labels, income_brackets)

# Binary splits of the ordinal variables; each is NA where its source is NA.

# College education: 1 when the education code is 5 or higher, 0 below that
dat$college.education <- as.numeric(dat$education >= 5)

# Under 40: 1 when the age code is below 5 (the four youngest brackets), else 0
dat$under.40 <- as.numeric(dat$age < 5)

# Income over 200k: 1 only for the top income code (7), 0 for codes 1-6
dat$income.over.200k <- as.numeric(dat$income > 6)




### Save the data
# Writes the cleaned data frame `dat` to "Round_1_clean.Rdata"; the path is
# relative, so the file lands in the current working directory.
save(dat, file = "Round_1_clean.Rdata")


